8.
(a) Use the read.csv() function to read the data into R. Call the loaded data college. Make sure that you have the directory set to the correct location for the data.
# Read the College data set; use <- (conventional R assignment) rather than =.
college <- read.csv("/Users/ranjiang/Desktop/dis/dataset/College.csv")
(b)
# Use the college names (first column) as row names so they label the
# observations without being treated as a data variable.
rownames(college) <- college[,1]
# Look at the data using the View() function.
View(college)
# Drop the first column: its contents now live in the row names.
college <- college[,-1]
View(college)
(c)
i. Use the summary() function to produce a numerical summary of the variables in the data set.
summary(college)
## Private Apps Accept Enroll
## Length:777 Min. : 81 Min. : 72 Min. : 35
## Class :character 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242
## Mode :character Median : 1558 Median : 1110 Median : 434
## Mean : 3002 Mean : 2019 Mean : 780
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902
## Max. :48094 Max. :26330 Max. :6392
## Top10perc Top25perc F.Undergrad P.Undergrad
## Min. : 1.00 Min. : 9.0 Min. : 139 Min. : 1.0
## 1st Qu.:15.00 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0
## Median :23.00 Median : 54.0 Median : 1707 Median : 353.0
## Mean :27.56 Mean : 55.8 Mean : 3700 Mean : 855.3
## 3rd Qu.:35.00 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0
## Max. :96.00 Max. :100.0 Max. :31643 Max. :21836.0
## Outstate Room.Board Books Personal
## Min. : 2340 Min. :1780 Min. : 96.0 Min. : 250
## 1st Qu.: 7320 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850
## Median : 9990 Median :4200 Median : 500.0 Median :1200
## Mean :10441 Mean :4358 Mean : 549.4 Mean :1341
## 3rd Qu.:12925 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700
## Max. :21700 Max. :8124 Max. :2340.0 Max. :6800
## PhD Terminal S.F.Ratio perc.alumni
## Min. : 8.00 Min. : 24.0 Min. : 2.50 Min. : 0.00
## 1st Qu.: 62.00 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00
## Median : 75.00 Median : 82.0 Median :13.60 Median :21.00
## Mean : 72.66 Mean : 79.7 Mean :14.09 Mean :22.74
## 3rd Qu.: 85.00 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00
## Max. :103.00 Max. :100.0 Max. :39.80 Max. :64.00
## Expend Grad.Rate
## Min. : 3186 Min. : 10.00
## 1st Qu.: 6751 1st Qu.: 53.00
## Median : 8377 Median : 65.00
## Mean : 9660 Mean : 65.46
## 3rd Qu.:10830 3rd Qu.: 78.00
## Max. :56233 Max. :118.00
# pairs(college[,1:10]) directly will fail since college$Private is the
# character vector "Yes"/"No" (class(college$Private) -> "character");
# convert it with factor() or as.factor() first.
# as.factor() is faster than factor() when the input is already a factor
# or integer (it returns it unchanged); otherwise they are equivalent here.
college$Private <- as.factor(college$Private)
#college[,1:10]
pairs(college[,1:10],col = "pink")
# plot() dispatches on class: with a factor on the x-axis it draws
# side-by-side boxplots rather than a scatter plot.
#colors()
plot(college$Private, college$Outstate,col = c("orange","palegreen2"),xlab = "Private", ylab = "Outstate")
# Create a qualitative variable Elite: "Yes" when more than 50% of new
# students come from the top 10% of their high-school class.
college$Elite <- rep("No",nrow(college))
college$Elite[college$Top10perc>50] <- "Yes"
college$Elite <- as.factor(college$Elite)
#head(college)
# NOTE: the original code first copied Top10perc into Elite (a dead store,
# immediately overwritten by rep() above) and then ran
# college <- data.frame(college, college$Elite), which appended a redundant
# duplicate column `college.Elite`; both statements are dropped here.
summary(college$Elite)
## No Yes
## 699 78
plot(college$Elite,college$Outstate,col=c("pink","lemonchiffon1"))
#head(college)
# 2x2 grid of histograms for a few quantitative variables.
par(mfrow = c(2, 2))
# A breaks = c(...) argument could be supplied to control the bin edges.
hist(college$Apps, col = "skyblue", xlab = "APPs",main = "Number of applications received")
hist(college$Accept, col = "skyblue", xlab = "Accept",main = "Number of applicants accepted")
hist(college$Enroll, col = "skyblue", xlab= "Enroll", main = "Enroll")
hist(college$Top10perc, col = "skyblue", xlab= "Top10perc", main = "Top10perc")
#head(college)
#college$AcceptanceRate <- college$Accept/college$Apps*100
# Side-by-side scatter plots of graduation rate against two predictors.
par(mfrow = c(1, 2))
plot(college$Outstate,college$Grad.Rate,col="skyblue",xlab = "Out-of-state tuition",ylab = "Graduation rate")
plot(college$Top10perc,college$Grad.Rate,col="skyblue",xlab = "Top10perc",ylab = "Graduation rate")
lm(college$Grad.Rate~college$Outstate+college$Top10perc)
##
## Call:
## lm(formula = college$Grad.Rate ~ college$Outstate + college$Top10perc)
##
## Coefficients:
## (Intercept) college$Outstate college$Top10perc
## 39.546160 0.001829 0.247416
#Waiting for supplements...
9. This exercise involves the Auto data set studied in the lab. Make sure that the missing values have been removed from the data.
auto <- read.csv("/Users/ranjiang/Desktop/dis/dataset/Auto.csv")
cleanedAuto <- na.omit(auto)
# Remove rows where horsepower carries the missing-value code "?".
# Logical subsetting with != is safer than -which(): if no row matched,
# which() would return integer(0) and x[-integer(0), ] drops EVERY row.
cleanedAuto <- cleanedAuto[cleanedAuto$horsepower != "?", ]
(a) Which of the predictors are quantitative, and which are qualitative?
# quantitative:mpg, cylinders, displacement, horsepower, weight, acceleration, year
# qualitative:name, origin
(b) What is the range of each quantitative predictor? You can answer this using the range() function. range()
# sapply() takes a list, vector or data frame and simplifies the per-column
# results into a matrix — here one range() per column.
# horsepower must be converted to numeric first: read.csv() imported it as
# character (missing values are coded "?"), and range() on character data
# compares lexicographically — the uncorrected output reported min "100"
# and max "98" and coerced the whole result matrix to character.
cleanedAuto$horsepower <- as.numeric(cleanedAuto$horsepower)
sapply(cleanedAuto[,1:7],range)
##      mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0         3           68         46   1613          8.0   70
## [2,] 46.6        8          455        230   5140         24.8   82
(c) What is the mean and standard deviation of each quantitative predictor?
# horsepower was read as character (missing values coded "?"); convert to
# numeric so mean()/sd() work. Idempotent if it is already numeric.
cleanedAuto$horsepower <- as.numeric(cleanedAuto$horsepower)
sapply(cleanedAuto[,1:7],mean)
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year
## 75.979592
sapply(cleanedAuto[,1:7],sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864
## year
## 3.683737
(d) Now remove the 10th through 85th observations. What is the range, mean, and standard deviation of each predictor in the subset of the data that remains?
# Remove observations 10 through 85.
newAuto <- cleanedAuto[-(10:85),]
# Part (d) asks for the range as well as the mean and standard deviation:
sapply(newAuto[,1:7],range)
sapply(newAuto[,1:7],mean)
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year
## 77.145570
sapply(newAuto[,1:7],sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year
## 3.106217
(e) Using the full data set, investigate the predictors graphically, using scatterplots or other tools of your choice. Create some plots highlighting the relationships among the predictors. Comment on your findings.
# Overall scatter-plot matrix of the seven quantitative variables.
pairs(cleanedAuto[,1:7],col="orange")
# Findings from the pairwise plots:
# mpg and year/acceleration appear positively correlated
# mpg and displacement/horsepower/weight appear negatively correlated
# displacement and horsepower/weight appear positively correlated
# displacement and acceleration appear negatively correlated
# horsepower and weight appear positively correlated
# weight and acceleration appear negatively correlated
(f) Suppose that we wish to predict gas mileage (mpg) on the basis of the other variables. Do your plots suggest that any of the other variables might be useful in predicting mpg? Justify your answer.
#mpg and year are positively correlated
#mpg and weight are negatively correlated
summary(lm(cleanedAuto$mpg~cleanedAuto$year+cleanedAuto$acceleration
+cleanedAuto$displacement+cleanedAuto$horsepower+cleanedAuto$weight))
##
## Call:
## lm(formula = cleanedAuto$mpg ~ cleanedAuto$year + cleanedAuto$acceleration +
## cleanedAuto$displacement + cleanedAuto$horsepower + cleanedAuto$weight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5211 -2.3920 -0.1036 2.0312 14.2874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.544e+01 4.677e+00 -3.300 0.00106 **
## cleanedAuto$year 7.541e-01 5.261e-02 14.334 < 2e-16 ***
## cleanedAuto$acceleration 9.032e-02 1.019e-01 0.886 0.37599
## cleanedAuto$displacement 2.782e-03 5.462e-03 0.509 0.61082
## cleanedAuto$horsepower 1.020e-03 1.376e-02 0.074 0.94095
## cleanedAuto$weight -6.874e-03 6.653e-04 -10.333 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.435 on 386 degrees of freedom
## Multiple R-squared: 0.8088, Adjusted R-squared: 0.8063
## F-statistic: 326.5 on 5 and 386 DF, p-value: < 2.2e-16
10. This exercise involves the Boston housing data set.
(a) To begin, load in the Boston data set. The Boston data set is part of the ISLR2 library. How many rows are in this data set? How many columns? What do the rows and columns represent?
Boston <- read.csv("/Users/ranjiang/Desktop/dis/dataset/Boston.csv")
#crim - Per capita crime rate by town
#zn - Proportion of residential land zoned for lots over 25,000 sq.ft.
#indus - Proportion of non-retail business acres per town
#chas - Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#nox - Nitrogen oxides concentration (parts per 10 million)
#rm - Average number of rooms per dwelling
#age - Proportion of owner-occupied units built prior to 1940
#dis - Weighted mean of distances to five Boston employment centres
#rad - Index of accessibility to radial highways
#tax - Full-value property-tax rate per $10,000
#ptratio - Pupil-teacher ratio by town
#lstat - Lower status of the population (percent)
#medv - Median value of owner-occupied homes in $1000s
dim(Boston)
## [1] 506 14
#rows = 506, samples
#columns = 14, variables/features
(b) Make some pairwise scatterplots of the predictors (columns) in this data set. Describe your findings.
head(Boston)
## X crim zn indus chas nox rm age dis rad tax ptratio lstat medv
## 1 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 4.98 24.0
## 2 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 9.14 21.6
## 3 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 4.03 34.7
## 4 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 2.94 33.4
## 5 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 5.33 36.2
## 6 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 5.21 28.7
pairs(Boston,col="lightblue")
(c) Are any of the predictors associated with per capita crime rate? If so, explain the relationship.
par(mfrow = c(1,2))
# ("distance" typo fixed in the axis label)
plot(Boston$dis, Boston$crim,col = "grey1",xlab = "distance", ylab = "crime")
# Tracts closer to the Boston employment centres show higher crime rates
plot(Boston$age, Boston$crim,col = "grey1",xlab = "age",ylab = "crime")
# Tracts with a higher share of older owner-occupied units show more crime
(d) Do any of the census tracts of Boston appear to have particularly high crime rates? Tax rates? Pupil-teacher ratios? Comment on the range of each predictor.
# range() of each variable, returned as a named list ("ptratio" typo fixed).
lapply(data.frame(crime = Boston$crim,tax = Boston$tax,ptratio =Boston$ptratio), range)
## $crime
## [1] 0.00632 88.97620
##
## $tax
## [1] 187 711
##
## $ptratio
## [1] 12.6 22.0
# crim and tax span very wide ranges; ptratio varies comparatively little
par(mfrow=c(2,2))
# Only tracts with per-capita crime rate > 1, to show the high-crime tail.
hist(Boston$crim[Boston$crim>1],breaks=25, xlab = "Crime rates", main = "Crime rates in different towns",col = "lightblue")
# Most tracts have low crime rates; a few reach very high values.
hist(Boston$tax, breaks=25, xlab = "Tax", main = "Tax in different towns",col = "lightblue")
# There is a large gap between the low-tax and high-tax groups of tracts.
hist(Boston$ptratio, breaks=25, xlab = "Pupil-teacher ratio", main = "Pupil-teacher ratio in different towns",col = "lightblue")
# Pupil-teacher ratio only ranges from 12.6 to 22.0, so no tract is extreme.
(e) How many of the census tracts in this data set bound the Charles river?
dim(subset(Boston, chas == 1))
## [1] 35 14
#35
(f) What is the median pupil-teacher ratio among the towns in this data set?
median(Boston$ptratio)
## [1] 19.05
(g) Which census tract of Boston has lowest median value of owner occupied homes? What are the values of the other predictors for that census tract, and how do those values compare to the overall ranges for those predictors? Comment on your findings.
dim(Boston)  # 506 census tracts, 14 columns
## [1] 506 14
# Tracts whose medv equals the overall minimum; t() so each tract prints
# as a column. ("lowest" typo fixed in the variable name.)
lowestMedv <- t(subset(Boston, medv == min(Boston$medv)))
lowestMedv  # tracts 399 and 406
## 399 406
## X 399.0000 406.0000
## crim 38.3518 67.9208
## zn 0.0000 0.0000
## indus 18.1000 18.1000
## chas 0.0000 0.0000
## nox 0.6930 0.6930
## rm 5.4530 5.6830
## age 100.0000 100.0000
## dis 1.4896 1.4254
## rad 24.0000 24.0000
## tax 666.0000 666.0000
## ptratio 20.2000 20.2000
## lstat 30.5900 22.9800
## medv 5.0000 5.0000
# Rank every tract by age (ascending); ties take the maximum rank.
Boston$rankAge <- rank(Boston$age, na.last = TRUE, ties.method = "max")
c(x1=Boston[399,]$rankAge,x2=Boston[406,]$rankAge)  # both rank 506: oldest housing stock
## x1 x2
## 506 506
#View(Boston)
(h) In this data set, how many of the census tracts average more than seven rooms per dwelling? More than eight rooms per dwelling? Comment on the census tracts that average more than eight rooms per dwelling.
dim(subset(Boston,Boston$rm>7))#64
## [1] 64 15
dim(subset(Boston,Boston$rm>8))#13
## [1] 13 15
summary(subset(Boston,Boston$rm>8))
## X crim zn indus
## Min. : 98.0 Min. :0.02009 Min. : 0.00 Min. : 2.680
## 1st Qu.:225.0 1st Qu.:0.33147 1st Qu.: 0.00 1st Qu.: 3.970
## Median :233.0 Median :0.52014 Median : 0.00 Median : 6.200
## Mean :232.3 Mean :0.71879 Mean :13.62 Mean : 7.078
## 3rd Qu.:258.0 3rd Qu.:0.57834 3rd Qu.:20.00 3rd Qu.: 6.200
## Max. :365.0 Max. :3.47428 Max. :95.00 Max. :19.580
## chas nox rm age
## Min. :0.0000 Min. :0.4161 Min. :8.034 Min. : 8.40
## 1st Qu.:0.0000 1st Qu.:0.5040 1st Qu.:8.247 1st Qu.:70.40
## Median :0.0000 Median :0.5070 Median :8.297 Median :78.30
## Mean :0.1538 Mean :0.5392 Mean :8.349 Mean :71.54
## 3rd Qu.:0.0000 3rd Qu.:0.6050 3rd Qu.:8.398 3rd Qu.:86.50
## Max. :1.0000 Max. :0.7180 Max. :8.780 Max. :93.90
## dis rad tax ptratio
## Min. :1.801 Min. : 2.000 Min. :224.0 Min. :13.00
## 1st Qu.:2.288 1st Qu.: 5.000 1st Qu.:264.0 1st Qu.:14.70
## Median :2.894 Median : 7.000 Median :307.0 Median :17.40
## Mean :3.430 Mean : 7.462 Mean :325.1 Mean :16.36
## 3rd Qu.:3.652 3rd Qu.: 8.000 3rd Qu.:307.0 3rd Qu.:17.40
## Max. :8.907 Max. :24.000 Max. :666.0 Max. :20.20
## lstat medv rankAge
## Min. :2.47 Min. :21.9 Min. : 10.0
## 1st Qu.:3.32 1st Qu.:41.7 1st Qu.:223.0
## Median :4.14 Median :48.3 Median :258.0
## Mean :4.31 Mean :44.2 Mean :243.2
## 3rd Qu.:5.12 3rd Qu.:50.0 3rd Qu.:308.0
## Max. :7.44 Max. :50.0 Max. :378.0
Why assign the result of lm() to lm.fit? Storing the fitted model object in a variable lets us reuse it later — summary(lm.fit), predict(lm.fit, ...), plot(lm.fit), anova(...) — without refitting the model each time.
#library(MASS)
library(ISLR2)
##
## Attaching package: 'ISLR2'
## The following object is masked _by_ '.GlobalEnv':
##
## Boston
auto <- read.csv("/Users/ranjiang/Desktop/dis/dataset/Auto.csv")
cleanedAuto <- na.omit(auto)
# Remove "?" horsepower rows via logical subsetting; -which() would drop
# every row if which() returned integer(0).
cleanedAuto <- cleanedAuto[cleanedAuto$horsepower != "?", ]
cleanedAuto$horsepower <- as.numeric(cleanedAuto$horsepower)
# Store the fitted model so summary(), predict(), plot() etc. can reuse it.
lm.fit <- lm(mpg ~ horsepower, cleanedAuto)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = cleanedAuto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
#Yes, since F-statistic >> 1 and p-value is close to 0, we have enough evidence to reject H_0
#Residual standard error(RSE): 4.906
# Mean mpg, for comparison with the residual standard error above.
# (TRUE spelled out: T is an ordinary variable and can be reassigned.)
mean(cleanedAuto$mpg, na.rm = TRUE)
## [1] 23.44592
# The relationship is negative: the horsepower coefficient is below zero.
predict(lm.fit, data.frame( horsepower=c(98) ), interval = "prediction")#[14.81,34.12]
## fit lwr upr
## 1 24.46708 14.8094 34.12476
predict(lm.fit, data.frame( horsepower=c(98) ), interval = "confidence") #[23.97,24.96]
## fit lwr upr
## 1 24.46708 23.97308 24.96108
# Scatter plot with the fitted least-squares line overlaid.
plot(cleanedAuto$horsepower,cleanedAuto$mpg)
abline(lm.fit)
(c) Use the plot() function to produce diagnostic plots of the least squares regression fit. Comment on any problems you see with the fit.
par(mfrow = c(2,2))
# The four standard lm diagnostic plots.
plot(lm.fit)
# Panel (1,1): a "U"-shaped residuals-vs-fitted curve -> the relationship
# is non-linear; residuals depend on the fitted values.
# Panel (1,2): under the normality assumption the Q-Q points should lie
# on the reference line.
# Encode name numerically (via factor levels) so pairs() can include it.
tmp <- cleanedAuto
tmp$name <- as.numeric(as.factor(tmp$name))
pairs(tmp)
(b) Compute the matrix of correlations between the variables using the function cor(). You will need to exclude the name variable, which is qualitative.
# name (column 9) is qualitative, so correlate columns 1-8 only.
# (The original wrapped this in subset(), which is a no-op when no
# subset= / select= argument is given.)
cor(cleanedAuto[,1:8])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
lm.fit1 <- lm(mpg~.-name,cleanedAuto)
summary(lm.fit1)
##
## Call:
## lm(formula = mpg ~ . - name, data = cleanedAuto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
Comment on the output. For instance: i. Is there a relationship between the predictors and the response?
# Yes, since F-statistic >> 1, and p-value is closed to 0, we reject H_0.
# displacement, weight, year, origin
# 0.750773 > 0 -> y is positively related to year
par(mfrow = c(2,2))
plot(lm.fit1)
# large outliers: Yes, no.323,326,327 (see fig.(1,1)).
# high leverage: Yes, the no.14 (see fig.(2,2)).
# significant displacement, weight, year, origin
lm.fitInter <- lm(mpg ~ .-name + displacement*weight + cylinders * horsepower+ cylinders*year, cleanedAuto)
summary(lm.fitInter)
##
## Call:
## lm(formula = mpg ~ . - name + displacement * weight + cylinders *
## horsepower + cylinders * year, data = cleanedAuto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5716 -1.5438 -0.0587 1.2227 12.6501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.738e+01 1.432e+01 -3.309 0.001027 **
## cylinders 8.190e+00 2.758e+00 2.969 0.003172 **
## displacement -4.966e-02 1.317e-02 -3.770 0.000189 ***
## horsepower -1.415e-01 4.656e-02 -3.039 0.002535 **
## weight -8.076e-03 1.135e-03 -7.118 5.49e-12 ***
## acceleration 7.576e-03 9.404e-02 0.081 0.935836
## year 1.394e+00 1.624e-01 8.585 2.35e-16 ***
## origin 6.497e-01 2.524e-01 2.574 0.010433 *
## displacement:weight 1.458e-05 3.514e-06 4.149 4.12e-05 ***
## cylinders:horsepower 1.448e-02 6.415e-03 2.257 0.024585 *
## cylinders:year -1.241e-01 3.058e-02 -4.057 6.03e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.838 on 381 degrees of freedom
## Multiple R-squared: 0.8712, Adjusted R-squared: 0.8678
## F-statistic: 257.6 on 10 and 381 DF, p-value: < 2.2e-16
#significant: cylinders*year, displacement*weight, cylinders*horsepower,
lm.fitLog <- lm(mpg ~ .-name+log(horsepower),cleanedAuto)
summary(lm.fitLog) #F-statistic>>1 and p-value is closed to 0 -> linear relationship
##
## Call:
## lm(formula = mpg ~ . - name + log(horsepower), data = cleanedAuto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5777 -1.6623 -0.1213 1.4913 12.0230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.674e+01 1.106e+01 7.839 4.54e-14 ***
## cylinders -5.530e-02 2.907e-01 -0.190 0.849230
## displacement -4.607e-03 7.108e-03 -0.648 0.517291
## horsepower 1.764e-01 2.269e-02 7.775 7.05e-14 ***
## weight -3.366e-03 6.561e-04 -5.130 4.62e-07 ***
## acceleration -3.277e-01 9.670e-02 -3.388 0.000776 ***
## year 7.421e-01 4.534e-02 16.368 < 2e-16 ***
## origin 8.976e-01 2.528e-01 3.551 0.000432 ***
## log(horsepower) -2.685e+01 2.652e+00 -10.127 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.959 on 383 degrees of freedom
## Multiple R-squared: 0.8592, Adjusted R-squared: 0.8562
## F-statistic: 292.1 on 8 and 383 DF, p-value: < 2.2e-16
lm.fitSqrt <- lm(mpg ~ .-name+sqrt(horsepower),cleanedAuto)
summary(lm.fitSqrt) #F-statistic>>1 and p-value is closed to 0 -> linear relationship
##
## Call:
## lm(formula = mpg ~ . - name + sqrt(horsepower), data = cleanedAuto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5402 -1.6717 -0.0778 1.4861 11.9754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.299e+01 7.251e+00 5.929 6.82e-09 ***
## cylinders 6.037e-02 2.928e-01 0.206 0.836748
## displacement -5.870e-03 7.156e-03 -0.820 0.412560
## horsepower 4.239e-01 4.532e-02 9.353 < 2e-16 ***
## weight -3.285e-03 6.604e-04 -4.975 9.87e-07 ***
## acceleration -3.342e-01 9.705e-02 -3.443 0.000638 ***
## year 7.398e-01 4.536e-02 16.308 < 2e-16 ***
## origin 9.159e-01 2.526e-01 3.626 0.000326 ***
## sqrt(horsepower) -1.050e+01 1.039e+00 -10.104 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.961 on 383 degrees of freedom
## Multiple R-squared: 0.8591, Adjusted R-squared: 0.8561
## F-statistic: 291.8 on 8 and 383 DF, p-value: < 2.2e-16
lm.fitPower <- lm(mpg ~ .-name+ I(horsepower^2),cleanedAuto)
summary(lm.fitPower) #F-statistic>>1 and p-value is closed to 0 -> linear relationship
##
## Call:
## lm(formula = mpg ~ . - name + I(horsepower^2), data = cleanedAuto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5497 -1.7311 -0.2236 1.5877 11.9955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3236564 4.6247696 0.286 0.774872
## cylinders 0.3489063 0.3048310 1.145 0.253094
## displacement -0.0075649 0.0073733 -1.026 0.305550
## horsepower -0.3194633 0.0343447 -9.302 < 2e-16 ***
## weight -0.0032712 0.0006787 -4.820 2.07e-06 ***
## acceleration -0.3305981 0.0991849 -3.333 0.000942 ***
## year 0.7353414 0.0459918 15.989 < 2e-16 ***
## origin 1.0144130 0.2545545 3.985 8.08e-05 ***
## I(horsepower^2) 0.0010060 0.0001065 9.449 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.001 on 383 degrees of freedom
## Multiple R-squared: 0.8552, Adjusted R-squared: 0.8522
## F-statistic: 282.8 on 8 and 383 DF, p-value: < 2.2e-16
anova(lm.fit1, lm.fitPower) #lm.fitPower is better
## Analysis of Variance Table
##
## Model 1: mpg ~ (cylinders + displacement + horsepower + weight + acceleration +
## year + origin + name) - name
## Model 2: mpg ~ (cylinders + displacement + horsepower + weight + acceleration +
## year + origin + name) - name + I(horsepower^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 384 4252.2
## 2 383 3448.4 1 803.84 89.281 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
carseats <- read.csv("/Users/ranjiang/Desktop/dis/dataset/Carseats.csv")
head(carseats)
## Sales CompPrice Income Advertising Population Price ShelveLoc Age Education
## 1 9.50 138 73 11 276 120 Bad 42 17
## 2 11.22 111 48 16 260 83 Good 65 10
## 3 10.06 113 35 10 269 80 Medium 59 12
## 4 7.40 117 100 4 466 97 Medium 55 14
## 5 4.15 141 64 3 340 128 Bad 38 13
## 6 10.81 124 113 13 501 72 Bad 78 16
## Urban US
## 1 Yes Yes
## 2 Yes Yes
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 No Yes
dim(carseats)
## [1] 400 11
#CompPrice:Price charged by competitor at each location
#Advertising:Local advertising budget for company at each location
#Population:Population size in region
#Price:Price company charges for car seats at each site
#ShelveLoc::A factor with levels Bad, Good and Medium indicating the quality of the shelving location for the car seats at each site
#Urban-Yes or No
#US-Yes or No
# Fit on the carseats data frame loaded above. (The original passed ISLR2's
# built-in Carseats instead of the CSV just read into `carseats`; the data
# are identical, but using the loaded object keeps the script consistent.)
lm.fitMul <- lm(Sales ~ Price + Urban + US, carseats)
summary(lm.fitMul)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
# \hat{\beta_0} = 13.043469
# Sales is related negatively to Price.
# There isn't a relationship between Sales and Urban
# There is a relationship between Sales and whether the shop is in the US, and it's a positive relationship if "yes"
#Sales = 13.04 - 0.05 Price - 0.02 Urban + 1.20 US
#Urban: if Yes, then 1, else 0
#US: if Yes, then 1, else 0
#Price and US
# Refit with the insignificant Urban dropped; again on the loaded carseats
# data frame rather than ISLR2's built-in Carseats (identical data).
lm.fitMul1 <- lm(Sales ~ Price + US, carseats)
summary(lm.fitMul1)
##
## Call:
## lm(formula = Sales ~ Price + US, data = carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9269 -1.6286 -0.0574 1.5766 7.0515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.03079 0.63098 20.652 < 2e-16 ***
## Price -0.05448 0.00523 -10.416 < 2e-16 ***
## USYes 1.19964 0.25846 4.641 4.71e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.469 on 397 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2354
## F-statistic: 62.43 on 2 and 397 DF, p-value: < 2.2e-16
# Compute the RSE and R^2.
confint(lm.fitMul1)
## 2.5 % 97.5 %
## (Intercept) 11.79032020 14.27126531
## Price -0.06475984 -0.04419543
## USYes 0.69151957 1.70776632
set.seed (1)
x <- rnorm (100)
y <- 2 * x + rnorm (100)
lm.fit0re <- lm(y~x+0)
summary(lm.fit0re)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9154 -0.6472 -0.1771 0.5056 2.3109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.9939 0.1065 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9586 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
#F-statistic>>0 and p-value is near 0 -> reject H_0
lm.fit0reReverse <- lm(x~y+0)
summary(lm.fit0reReverse)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8699 -0.2368 0.1030 0.2858 0.8938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.39111 0.02089 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4246 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
#F-statistic>>0 and p-value is near 0 -> reject H_0
# In fact, these are the same line.
\[ SE(\hat{\beta}) = \sqrt{\frac{\sum_{i=1}^n(y_i-x_i\hat{\beta})^2}{(n-1)\sum_{i'=1}^n x_{i'}^2}} \] (These formulas are slightly different from those given in Sections 3.1.1 and 3.1.2, since here we are performing regression without an intercept.) Show algebraically, and confirm numerically in R, that the t-statistic can be written as
\[ \frac{(\sqrt{n-1})\sum_{i=1}^nx_iy_i}{\sqrt{(\sum_{i=1}^nx_i^2)(\sum_{i'=1}^ny_{i'}^2)-(\sum_{i'=1}^nx_{i'}y_{i'})^2}} \]
# Sample size taken from the data rather than hard-coding 100 (and the
# literal 99 for n - 1), so the check generalizes to any equal-length x, y.
n <- length(x)
# Least-squares slope for regression through the origin: sum(xy)/sum(x^2).
beta <- sum(x*y) / sum(x^2)
# Standard error of the slope (no-intercept formula, n - 1 in the denominator).
seBeta <- sqrt(sum((y - x*beta)^2) / ((n - 1) * sum(x^2)))
t1 <- beta/seBeta
t1
## [1] 18.72593
# Equivalent closed form of the t-statistic; note it is symmetric in x and y.
t2 <- (sqrt(n-1) * sum(x*y) ) / (sqrt(sum(x^2) * sum(y^2) - sum(x*y) ^ 2))
t2
## [1] 18.72593
#symmetric formula -> t(x,y)=t(y,x)
lm.fityx = lm(y~x+0)
lm.fitxy = lm(x~y+0)
summary(lm.fityx)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9154 -0.6472 -0.1771 0.5056 2.3109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.9939 0.1065 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9586 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
summary(lm.fitxy)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8699 -0.2368 0.1030 0.2858 0.8938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.39111 0.02089 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4246 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
#same t value
#\sum_i^n x_i^2 = \sum_i^n y_i^2
set.seed(520)
x <- rnorm(100)
y <- 5*x
lm.fit312b1 <- lm(y~x+0)
summary(lm.fit312b1)
## Warning in summary.lm(lm.fit312b1): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.948e-15 -1.349e-16 5.300e-18 1.676e-16 2.200e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 5.00e+00 2.11e-16 2.369e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.259e-15 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 5.614e+32 on 1 and 99 DF, p-value: < 2.2e-16
lm.fit312b2 <- lm(x~y+0)
summary(lm.fit312b2)
## Warning in summary.lm(lm.fit312b2): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.650e-16 -3.235e-17 -6.410e-18 3.276e-17 3.751e-16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 2.000e-01 1.668e-18 1.199e+17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.926e-17 on 99 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.438e+34 on 1 and 99 DF, p-value: < 2.2e-16
#coefficients are different
set.seed(520)
x <- rnorm(100)
# y is a negated permutation of x, so sum(y^2) equals sum(x^2) exactly —
# the condition under which the no-intercept slope of y ~ x equals the
# no-intercept slope of x ~ y.
y <- -sample(x, 100)
sum(x^2)
## [1] 114.5566
sum(y^2)
## [1] 114.5566
lm.s1 <- lm(y~x+0)
summary(lm.s1)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5840 -0.7309 -0.0813 0.6442 3.4051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x -0.06334 0.10030 -0.631 0.529
##
## Residual standard error: 1.074 on 99 degrees of freedom
## Multiple R-squared: 0.004012, Adjusted R-squared: -0.006049
## F-statistic: 0.3988 on 1 and 99 DF, p-value: 0.5292
lm.s2 <- lm(x~y+0)
summary(lm.s2)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2117 -0.6563 0.0336 0.7849 2.5734
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y -0.06334 0.10030 -0.631 0.529
##
## Residual standard error: 1.074 on 99 degrees of freedom
## Multiple R-squared: 0.004012, Adjusted R-squared: -0.006049
## F-statistic: 0.3988 on 1 and 99 DF, p-value: 0.5292
set.seed(1)
x <- rnorm(100)
eps <- rnorm(100, 0, 0.25)
y <- -1+0.5*x+eps #\beta_0=-1, \beta_1=0.5
length(y) #100
## [1] 100
plot(x,y) #positive and linear relationship
lm.fit13e = lm(y~x)
summary(lm.fit13e)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46921 -0.15344 -0.03487 0.13485 0.58654
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.00942 0.02425 -41.63 <2e-16 ***
## x 0.49973 0.02693 18.56 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2407 on 98 degrees of freedom
## Multiple R-squared: 0.7784, Adjusted R-squared: 0.7762
## F-statistic: 344.3 on 1 and 98 DF, p-value: < 2.2e-16
# They're close to the true coefficients (beta_0 = -1, beta_1 = 0.5)
plot(x,y)
abline(lm.fit13e,col="red")    # fitted least-squares line
abline(-1,0.5,col="darkgreen") # true population line
legend(-2,0,legend = c("model","population"),col = c("red","darkgreen"),lty = 2:3, cex = 0.6)
# 13(g): add a quadratic term to test for non-linearity
lm.fitPoly <- lm(y~x+I(x^2))
summary(lm.fitPoly)
##
## Call:
## lm(formula = y ~ x + I(x^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4913 -0.1563 -0.0322 0.1451 0.5675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.98582 0.02941 -33.516 <2e-16 ***
## x 0.50429 0.02700 18.680 <2e-16 ***
## I(x^2) -0.02973 0.02119 -1.403 0.164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2395 on 97 degrees of freedom
## Multiple R-squared: 0.7828, Adjusted R-squared: 0.7784
## F-statistic: 174.8 on 2 and 97 DF, p-value: < 2.2e-16
# No: the p-value for I(x^2) (0.164) is too large, so we do not have
# enough evidence to reject H_0 (no quadratic effect).
# 13(h): repeat with LESS noise (sd = 0.01 instead of 0.25).
set.seed(1)
x <- rnorm(100)
eps <- rnorm(100,0,0.01)
y <- -1+0.5*x+eps
lm.fit13h <- lm(y~x)
summary(lm.fit13h)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.018768 -0.006138 -0.001395 0.005394 0.023462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0003769 0.0009699 -1031.5 <2e-16 ***
## x 0.4999894 0.0010773 464.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.009628 on 98 degrees of freedom
## Multiple R-squared: 0.9995, Adjusted R-squared: 0.9995
## F-statistic: 2.154e+05 on 1 and 98 DF, p-value: < 2.2e-16
# Low-noise fit: points lie almost exactly on the line
plot(x,y,col = "skyblue")
abline(lm.fit13h,col="red")
abline(-1,0.5,col="darkgreen")
legend(-2,0,legend = c("model","population"),col = c("red","darkgreen"),lty = 2:3, cex = 0.6)
# Most of the points are on the line
# 13(i): repeat with MORE noise (sd = 1.0)
set.seed(1)
x <- rnorm(100)
eps <- rnorm(100,0,1.0)
y <- -1+0.5*x+eps
lm.fit13i <- lm(y~x)
summary(lm.fit13i)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8768 -0.6138 -0.1395 0.5394 2.3462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.03769 0.09699 -10.699 < 2e-16 ***
## x 0.49894 0.10773 4.632 1.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9628 on 98 degrees of freedom
## Multiple R-squared: 0.1796, Adjusted R-squared: 0.1712
## F-statistic: 21.45 on 1 and 98 DF, p-value: 1.117e-05
# High-noise fit: the scatter is much wider around the line
plot(x,y,col = "orange")
abline(lm.fit13i,col="red")
abline(-1,0.5,col="darkgreen")
legend(-2,0,legend = c("model","population"),col = c("red","darkgreen"),lty = 2:3, cex = 0.6)
# Points are scattered
# 13(j): compare 95% confidence intervals across the three noise levels
confint(lm.fit13e) #original
## 2.5 % 97.5 %
## (Intercept) -1.0575402 -0.9613061
## x 0.4462897 0.5531801
confint(lm.fit13h) #less noise
## 2.5 % 97.5 %
## (Intercept) -1.0023016 -0.9984522
## x 0.4978516 0.5021272
confint(lm.fit13i) #more noise
## 2.5 % 97.5 %
## (Intercept) -1.2301607 -0.8452245
## x 0.2851588 0.7127204
# More noise gives wider intervals
# 14(a): generate collinear predictors x1, x2 and response y
set.seed (1)
x1 <- runif (100)
x2 <- 0.5 * x1 + rnorm (100) / 10
y <- 2 + 2 * x1 + 0.3 * x2 + rnorm (100)
The last line corresponds to creating a linear model in which y is a function of \(x_1\) and \(x_2\). Write out the form of the linear model. What are the regression coefficients?
# y = 2 + 2*x_1 + 0.3*x_2 + \epsilon
# \beta_0 = 2, \beta_1 = 2, \beta_2 = 0.3
# 14(b): x1 and x2 are strongly correlated (r = 0.835)
cor(x1,x2)
## [1] 0.8351212
plot(x1,x2)
# 14(c): regress y on both predictors
lm.fit314c <- lm(y~x1+x2)
summary(lm.fit314c)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8311 -0.7273 -0.0537 0.6338 2.3359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1305 0.2319 9.188 7.61e-15 ***
## x1 1.4396 0.7212 1.996 0.0487 *
## x2 1.0097 1.1337 0.891 0.3754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.056 on 97 degrees of freedom
## Multiple R-squared: 0.2088, Adjusted R-squared: 0.1925
## F-statistic: 12.8 on 2 and 97 DF, p-value: 1.164e-05
#\hat{\beta}_0 = 2.1305
#\hat{\beta}_1 = 1.4396
#\hat{\beta}_2 = 1.0097
# Can we reject H_0: beta_1 = 0? Yes, since x1 is significant (p = 0.0487)
# Can we reject H_0: beta_2 = 0? No, since the p-value (0.3754) is too large
# 14(d): regress y on x1 alone
lm.fit314d <- lm(y~x1)
summary(lm.fit314d)
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.89495 -0.66874 -0.07785 0.59221 2.45560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1124 0.2307 9.155 8.27e-15 ***
## x1 1.9759 0.3963 4.986 2.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.055 on 98 degrees of freedom
## Multiple R-squared: 0.2024, Adjusted R-squared: 0.1942
## F-statistic: 24.86 on 1 and 98 DF, p-value: 2.661e-06
# Yes, we can reject H_0 — x1 alone is highly significant (p = 2.66e-06)
# 14(e): regress y on x2 alone
lm.fit314e <- lm(y~x2)
summary(lm.fit314e)
##
## Call:
## lm(formula = y ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.62687 -0.75156 -0.03598 0.72383 2.44890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3899 0.1949 12.26 < 2e-16 ***
## x2 2.8996 0.6330 4.58 1.37e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.072 on 98 degrees of freedom
## Multiple R-squared: 0.1763, Adjusted R-squared: 0.1679
## F-statistic: 20.98 on 1 and 98 DF, p-value: 1.366e-05
# Yes, we can reject H_0 — x2 alone is highly significant (p = 1.37e-05)
# 14(f): (b) showed cor(x1, x2) = 0.835
# The results do not contradict each other: (b) reveals that x1 and x2 are collinear.
# With x1 in the model, x1 already provides the information, so x2 adds nothing
# significant; with only x2 in the model, x2 must carry that information itself.
# 14(g): append one additional (mismeasured) observation to each vector
x1 <- c(x1 , 0.1)
x2 <- c(x2 , 0.8)
y <- c(y, 6)
Re-fit the linear models from (c) to (e) using this new data. What effect does this new observation have on each of the models? In each model, is this observation an outlier? A high-leverage point? Both? Explain your answers.
# Refit the two-predictor model with the extra observation included
lm.fitg1 <- lm(y~x1+x2)
summary(lm.fitg1)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.73348 -0.69318 -0.05263 0.66385 2.30619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2267 0.2314 9.624 7.91e-16 ***
## x1 0.5394 0.5922 0.911 0.36458
## x2 2.5146 0.8977 2.801 0.00614 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.075 on 98 degrees of freedom
## Multiple R-squared: 0.2188, Adjusted R-squared: 0.2029
## F-statistic: 13.72 on 2 and 98 DF, p-value: 5.564e-06
par(mfrow = c(2,2))
plot(lm.fitg1) # the new observation is a high-leverage point
lm.fitg2 <- lm(y~x1)
summary(lm.fitg2)
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8897 -0.6556 -0.0909 0.5682 3.5665
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2569 0.2390 9.445 1.78e-15 ***
## x1 1.7657 0.4124 4.282 4.29e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.111 on 99 degrees of freedom
## Multiple R-squared: 0.1562, Adjusted R-squared: 0.1477
## F-statistic: 18.33 on 1 and 99 DF, p-value: 4.295e-05
par(mfrow = c(2,2))
plot(lm.fitg2) # the new observation is a high-leverage point
lm.fitg3 <- lm(y~x2)
summary(lm.fitg3)
##
## Call:
## lm(formula = y ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.64729 -0.71021 -0.06899 0.72699 2.38074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3451 0.1912 12.264 < 2e-16 ***
## x2 3.1190 0.6040 5.164 1.25e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.074 on 99 degrees of freedom
## Multiple R-squared: 0.2122, Adjusted R-squared: 0.2042
## F-statistic: 26.66 on 1 and 99 DF, p-value: 1.253e-06
par(mfrow = c(2,2))
plot(lm.fitg3)# the new observation is a high-leverage point
# 15: Boston data. NOTE(review): absolute path is machine-specific.
Boston <- read.csv("/Users/ranjiang/Desktop/dis/dataset/Boston.csv")
names(Boston)
## [1] "X" "crim" "zn" "indus" "chas" "nox" "rm"
## [8] "age" "dis" "rad" "tax" "ptratio" "lstat" "medv"
# 15(a): simple regression of crim on each predictor in turn
#zn
lm.fitZn <- lm(crim~zn, Boston)
summary(lm.fitZn) #significant
##
## Call:
## lm(formula = crim ~ zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.429 -4.222 -2.620 1.250 84.523
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.45369 0.41722 10.675 < 2e-16 ***
## zn -0.07393 0.01609 -4.594 5.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.435 on 504 degrees of freedom
## Multiple R-squared: 0.04019, Adjusted R-squared: 0.03828
## F-statistic: 21.1 on 1 and 504 DF, p-value: 5.506e-06
#indus — simple regression of crim on indus
lm.fitIndus <- lm(crim~indus, Boston)
summary(lm.fitIndus)#significant
##
## Call:
## lm(formula = crim ~ indus, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.972 -2.698 -0.736 0.712 81.813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.06374 0.66723 -3.093 0.00209 **
## indus 0.50978 0.05102 9.991 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.866 on 504 degrees of freedom
## Multiple R-squared: 0.1653, Adjusted R-squared: 0.1637
## F-statistic: 99.82 on 1 and 504 DF, p-value: < 2.2e-16
#chas — the only dummy (0/1) predictor
lm.fitChas <- lm(crim~chas, Boston)
summary(lm.fitChas) # NOT significant (p = 0.209)
##
## Call:
## lm(formula = crim ~ chas, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.738 -3.661 -3.435 0.018 85.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7444 0.3961 9.453 <2e-16 ***
## chas -1.8928 1.5061 -1.257 0.209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.597 on 504 degrees of freedom
## Multiple R-squared: 0.003124, Adjusted R-squared: 0.001146
## F-statistic: 1.579 on 1 and 504 DF, p-value: 0.2094
#nox — simple regression of crim on nox
lm.fitNox <- lm(crim~nox, Boston)
summary(lm.fitNox)#significant
##
## Call:
## lm(formula = crim ~ nox, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.371 -2.738 -0.974 0.559 81.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.720 1.699 -8.073 5.08e-15 ***
## nox 31.249 2.999 10.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.81 on 504 degrees of freedom
## Multiple R-squared: 0.1772, Adjusted R-squared: 0.1756
## F-statistic: 108.6 on 1 and 504 DF, p-value: < 2.2e-16
#rm — simple regression of crim on rm
lm.fitRm <- lm(crim~rm, Boston)
summary(lm.fitRm)#significant
##
## Call:
## lm(formula = crim ~ rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.604 -3.952 -2.654 0.989 87.197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.482 3.365 6.088 2.27e-09 ***
## rm -2.684 0.532 -5.045 6.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.401 on 504 degrees of freedom
## Multiple R-squared: 0.04807, Adjusted R-squared: 0.04618
## F-statistic: 25.45 on 1 and 504 DF, p-value: 6.347e-07
#age — simple regression of crim on age
lm.fitAge <- lm(crim~age, Boston)
summary(lm.fitAge)#significant
##
## Call:
## lm(formula = crim ~ age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.789 -4.257 -1.230 1.527 82.849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.77791 0.94398 -4.002 7.22e-05 ***
## age 0.10779 0.01274 8.463 2.85e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.057 on 504 degrees of freedom
## Multiple R-squared: 0.1244, Adjusted R-squared: 0.1227
## F-statistic: 71.62 on 1 and 504 DF, p-value: 2.855e-16
#dis — simple regression of crim on dis
lm.fitDis <- lm(crim~dis, Boston)
summary(lm.fitDis)#significant
##
## Call:
## lm(formula = crim ~ dis, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.708 -4.134 -1.527 1.516 81.674
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.4993 0.7304 13.006 <2e-16 ***
## dis -1.5509 0.1683 -9.213 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.965 on 504 degrees of freedom
## Multiple R-squared: 0.1441, Adjusted R-squared: 0.1425
## F-statistic: 84.89 on 1 and 504 DF, p-value: < 2.2e-16
#rad — simple regression of crim on rad (largest univariate R^2: 0.39)
lm.fitRad <- lm(crim~rad, Boston)
summary(lm.fitRad)#significant
##
## Call:
## lm(formula = crim ~ rad, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.164 -1.381 -0.141 0.660 76.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.28716 0.44348 -5.157 3.61e-07 ***
## rad 0.61791 0.03433 17.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.718 on 504 degrees of freedom
## Multiple R-squared: 0.3913, Adjusted R-squared: 0.39
## F-statistic: 323.9 on 1 and 504 DF, p-value: < 2.2e-16
#tax — simple regression of crim on tax
lm.fitTax <- lm(crim~tax, Boston)
summary(lm.fitTax)#significant
##
## Call:
## lm(formula = crim ~ tax, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.513 -2.738 -0.194 1.065 77.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.528369 0.815809 -10.45 <2e-16 ***
## tax 0.029742 0.001847 16.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.997 on 504 degrees of freedom
## Multiple R-squared: 0.3396, Adjusted R-squared: 0.3383
## F-statistic: 259.2 on 1 and 504 DF, p-value: < 2.2e-16
#ptratio — simple regression of crim on ptratio
lm.fitPtratio <- lm(crim~ptratio, Boston)
summary(lm.fitPtratio)#significant
##
## Call:
## lm(formula = crim ~ ptratio, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.654 -3.985 -1.912 1.825 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.6469 3.1473 -5.607 3.40e-08 ***
## ptratio 1.1520 0.1694 6.801 2.94e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.24 on 504 degrees of freedom
## Multiple R-squared: 0.08407, Adjusted R-squared: 0.08225
## F-statistic: 46.26 on 1 and 504 DF, p-value: 2.943e-11
#lstat — simple regression of crim on lstat
lm.fitLstat <- lm(crim~lstat, Boston)
summary(lm.fitLstat)#significant
##
## Call:
## lm(formula = crim ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.925 -2.822 -0.664 1.079 82.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.33054 0.69376 -4.801 2.09e-06 ***
## lstat 0.54880 0.04776 11.491 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.664 on 504 degrees of freedom
## Multiple R-squared: 0.2076, Adjusted R-squared: 0.206
## F-statistic: 132 on 1 and 504 DF, p-value: < 2.2e-16
#medv — simple regression of crim on medv
lm.fitMedv <- lm(crim~medv, Boston)
summary(lm.fitMedv)#significant
##
## Call:
## lm(formula = crim ~ medv, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.071 -4.022 -2.343 1.298 80.957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.79654 0.93419 12.63 <2e-16 ***
## medv -0.36316 0.03839 -9.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.934 on 504 degrees of freedom
## Multiple R-squared: 0.1508, Adjusted R-squared: 0.1491
## F-statistic: 89.49 on 1 and 504 DF, p-value: < 2.2e-16
# 15(b): multiple regression on all predictors at once.
# Boston[,-1] drops the first column ("X", a row-index artifact of read.csv).
lm.Boston <- lm(crim~., Boston[,-1])
summary(lm.Boston)
##
## Call:
## lm(formula = crim ~ ., data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.534 -2.248 -0.348 1.087 73.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.7783938 7.0818258 1.946 0.052271 .
## zn 0.0457100 0.0187903 2.433 0.015344 *
## indus -0.0583501 0.0836351 -0.698 0.485709
## chas -0.8253776 1.1833963 -0.697 0.485841
## nox -9.9575865 5.2898242 -1.882 0.060370 .
## rm 0.6289107 0.6070924 1.036 0.300738
## age -0.0008483 0.0179482 -0.047 0.962323
## dis -1.0122467 0.2824676 -3.584 0.000373 ***
## rad 0.6124653 0.0875358 6.997 8.59e-12 ***
## tax -0.0037756 0.0051723 -0.730 0.465757
## ptratio -0.3040728 0.1863598 -1.632 0.103393
## lstat 0.1388006 0.0757213 1.833 0.067398 .
## medv -0.2200564 0.0598240 -3.678 0.000261 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.46 on 493 degrees of freedom
## Multiple R-squared: 0.4493, Adjusted R-squared: 0.4359
## F-statistic: 33.52 on 12 and 493 DF, p-value: < 2.2e-16
# Significant at 0.05 in the multiple regression: zn, dis, rad, medv
# 15(c): plot univariate slopes (x-axis) against multivariate slopes (y-axis).
# NOTE(review): this reuses the name `x`, overwriting the simulation vector above.
x <- c(coefficients(lm.fitZn)[2],
coefficients(lm.fitIndus)[2],
coefficients(lm.fitChas)[2],
coefficients(lm.fitNox)[2],
coefficients(lm.fitRm)[2],
coefficients(lm.fitAge)[2],
coefficients(lm.fitDis)[2],
coefficients(lm.fitRad)[2],
coefficients(lm.fitTax)[2],
coefficients(lm.fitPtratio)[2],
coefficients(lm.fitLstat)[2],
coefficients(lm.fitMedv)[2])
x
## zn indus chas nox rm age
## -0.07393498 0.50977633 -1.89277655 31.24853120 -2.68405122 0.10778623
## dis rad tax ptratio lstat medv
## -1.55090168 0.61791093 0.02974225 1.15198279 0.54880478 -0.36315992
# slots 2:13 of lm.Boston are zn..medv, matching the order of x above
y <- coefficients(lm.Boston)[2:13]
plot(x, y,col = "green2")
# 15(d): cubic polynomial fits to look for non-linearity (chas is skipped: dummy)
lm.zn <- lm(crim~poly(zn,3),Boston[,-1])
summary(lm.zn) # orders 1, 2 significant; cubic term is not
##
## Call:
## lm(formula = crim ~ poly(zn, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.821 -4.614 -1.294 0.473 84.130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3722 9.709 < 2e-16 ***
## poly(zn, 3)1 -38.7498 8.3722 -4.628 4.7e-06 ***
## poly(zn, 3)2 23.9398 8.3722 2.859 0.00442 **
## poly(zn, 3)3 -10.0719 8.3722 -1.203 0.22954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.372 on 502 degrees of freedom
## Multiple R-squared: 0.05824, Adjusted R-squared: 0.05261
## F-statistic: 10.35 on 3 and 502 DF, p-value: 1.281e-06
# cubic fit for indus
lm.indus <- lm(crim~poly(indus,3),Boston[,-1])
summary(lm.indus) # orders 1, 2, 3 all significant
##
## Call:
## lm(formula = crim ~ poly(indus, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.278 -2.514 0.054 0.764 79.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.330 10.950 < 2e-16 ***
## poly(indus, 3)1 78.591 7.423 10.587 < 2e-16 ***
## poly(indus, 3)2 -24.395 7.423 -3.286 0.00109 **
## poly(indus, 3)3 -54.130 7.423 -7.292 1.2e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.423 on 502 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2552
## F-statistic: 58.69 on 3 and 502 DF, p-value: < 2.2e-16
# cubic fit for nox
lm.nox <- lm(crim~poly(nox,3),Boston[,-1])
summary(lm.nox) # orders 1, 2, 3 all significant
##
## Call:
## lm(formula = crim ~ poly(nox, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.110 -2.068 -0.255 0.739 78.302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3216 11.237 < 2e-16 ***
## poly(nox, 3)1 81.3720 7.2336 11.249 < 2e-16 ***
## poly(nox, 3)2 -28.8286 7.2336 -3.985 7.74e-05 ***
## poly(nox, 3)3 -60.3619 7.2336 -8.345 6.96e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.234 on 502 degrees of freedom
## Multiple R-squared: 0.297, Adjusted R-squared: 0.2928
## F-statistic: 70.69 on 3 and 502 DF, p-value: < 2.2e-16
# cubic fit for rm
lm.rm <- lm(crim~poly(rm,3),Boston[,-1])
summary(lm.rm) # orders 1, 2 significant; cubic term is not
##
## Call:
## lm(formula = crim ~ poly(rm, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.485 -3.468 -2.221 -0.015 87.219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3703 9.758 < 2e-16 ***
## poly(rm, 3)1 -42.3794 8.3297 -5.088 5.13e-07 ***
## poly(rm, 3)2 26.5768 8.3297 3.191 0.00151 **
## poly(rm, 3)3 -5.5103 8.3297 -0.662 0.50858
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.33 on 502 degrees of freedom
## Multiple R-squared: 0.06779, Adjusted R-squared: 0.06222
## F-statistic: 12.17 on 3 and 502 DF, p-value: 1.067e-07
# cubic fit for age
lm.age <- lm(crim~poly(age,3),Boston[,-1])
summary(lm.age) # orders 1, 2, 3 all significant
##
## Call:
## lm(formula = crim ~ poly(age, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.762 -2.673 -0.516 0.019 82.842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3485 10.368 < 2e-16 ***
## poly(age, 3)1 68.1820 7.8397 8.697 < 2e-16 ***
## poly(age, 3)2 37.4845 7.8397 4.781 2.29e-06 ***
## poly(age, 3)3 21.3532 7.8397 2.724 0.00668 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.84 on 502 degrees of freedom
## Multiple R-squared: 0.1742, Adjusted R-squared: 0.1693
## F-statistic: 35.31 on 3 and 502 DF, p-value: < 2.2e-16
# cubic fit for dis
lm.dis <- lm(crim~poly(dis,3),Boston[,-1])
summary(lm.dis) # orders 1, 2, 3 all significant
##
## Call:
## lm(formula = crim ~ poly(dis, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.757 -2.588 0.031 1.267 76.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3259 11.087 < 2e-16 ***
## poly(dis, 3)1 -73.3886 7.3315 -10.010 < 2e-16 ***
## poly(dis, 3)2 56.3730 7.3315 7.689 7.87e-14 ***
## poly(dis, 3)3 -42.6219 7.3315 -5.814 1.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.331 on 502 degrees of freedom
## Multiple R-squared: 0.2778, Adjusted R-squared: 0.2735
## F-statistic: 64.37 on 3 and 502 DF, p-value: < 2.2e-16
# cubic fit for rad
lm.rad <- lm(crim~poly(rad,3),Boston[,-1])
summary(lm.rad) # orders 1, 2 significant; cubic term is not
##
## Call:
## lm(formula = crim ~ poly(rad, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.381 -0.412 -0.269 0.179 76.217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.2971 12.164 < 2e-16 ***
## poly(rad, 3)1 120.9074 6.6824 18.093 < 2e-16 ***
## poly(rad, 3)2 17.4923 6.6824 2.618 0.00912 **
## poly(rad, 3)3 4.6985 6.6824 0.703 0.48231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.682 on 502 degrees of freedom
## Multiple R-squared: 0.4, Adjusted R-squared: 0.3965
## F-statistic: 111.6 on 3 and 502 DF, p-value: < 2.2e-16
# cubic fit for tax
lm.tax <- lm(crim~poly(tax,3),Boston[,-1])
summary(lm.tax) # orders 1, 2 significant; cubic term is not
##
## Call:
## lm(formula = crim ~ poly(tax, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.273 -1.389 0.046 0.536 76.950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3047 11.860 < 2e-16 ***
## poly(tax, 3)1 112.6458 6.8537 16.436 < 2e-16 ***
## poly(tax, 3)2 32.0873 6.8537 4.682 3.67e-06 ***
## poly(tax, 3)3 -7.9968 6.8537 -1.167 0.244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.854 on 502 degrees of freedom
## Multiple R-squared: 0.3689, Adjusted R-squared: 0.3651
## F-statistic: 97.8 on 3 and 502 DF, p-value: < 2.2e-16
# cubic fit for ptratio
lm.ptratio <- lm(crim~poly(ptratio,3),Boston[,-1])
summary(lm.ptratio) # orders 1, 2, 3 all significant
##
## Call:
## lm(formula = crim ~ poly(ptratio, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.833 -4.146 -1.655 1.408 82.697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.361 10.008 < 2e-16 ***
## poly(ptratio, 3)1 56.045 8.122 6.901 1.57e-11 ***
## poly(ptratio, 3)2 24.775 8.122 3.050 0.00241 **
## poly(ptratio, 3)3 -22.280 8.122 -2.743 0.00630 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.122 on 502 degrees of freedom
## Multiple R-squared: 0.1138, Adjusted R-squared: 0.1085
## F-statistic: 21.48 on 3 and 502 DF, p-value: 4.171e-13
# cubic fit for lstat
lm.lstat <- lm(crim~poly(lstat,3),Boston[,-1])
summary(lm.lstat) # orders 1, 2 significant; cubic term is not
##
## Call:
## lm(formula = crim ~ poly(lstat, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.234 -2.151 -0.486 0.066 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3392 10.654 <2e-16 ***
## poly(lstat, 3)1 88.0697 7.6294 11.543 <2e-16 ***
## poly(lstat, 3)2 15.8882 7.6294 2.082 0.0378 *
## poly(lstat, 3)3 -11.5740 7.6294 -1.517 0.1299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.629 on 502 degrees of freedom
## Multiple R-squared: 0.2179, Adjusted R-squared: 0.2133
## F-statistic: 46.63 on 3 and 502 DF, p-value: < 2.2e-16
# cubic fit for medv
lm.medv <- lm(crim~poly(medv,3),Boston[,-1])
summary(lm.medv) # orders 1, 2, 3 all significant
##
## Call:
## lm(formula = crim ~ poly(medv, 3), data = Boston[, -1])
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.427 -1.976 -0.437 0.439 73.655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.292 12.374 < 2e-16 ***
## poly(medv, 3)1 -75.058 6.569 -11.426 < 2e-16 ***
## poly(medv, 3)2 88.086 6.569 13.409 < 2e-16 ***
## poly(medv, 3)3 -48.033 6.569 -7.312 1.05e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.569 on 502 degrees of freedom
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.4167
## F-statistic: 121.3 on 3 and 502 DF, p-value: < 2.2e-16
# Chapter 4 exercise: the Weekly stock-market data from ISLR2
library(ISLR2)
head(Weekly)
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.1549760 -0.270 Down
## 2 1990 -0.270 0.816 1.572 -3.936 -0.229 0.1485740 -2.576 Down
## 3 1990 -2.576 -0.270 0.816 1.572 -3.936 0.1598375 3.514 Up
## 4 1990 3.514 -2.576 -0.270 0.816 1.572 0.1616300 0.712 Up
## 5 1990 0.712 3.514 -2.576 -0.270 0.816 0.1537280 1.178 Up
## 6 1990 1.178 0.712 3.514 -2.576 -0.270 0.1544440 -1.372 Down
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
cor(Weekly[,-9])#Relationship between Volume and Lg1-5 are very small
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 -0.032289274 -0.03339001 -0.03000649 -0.031127923
## Lag1 -0.03228927 1.000000000 -0.07485305 0.05863568 -0.071273876
## Lag2 -0.03339001 -0.074853051 1.00000000 -0.07572091 0.058381535
## Lag3 -0.03000649 0.058635682 -0.07572091 1.00000000 -0.075395865
## Lag4 -0.03112792 -0.071273876 0.05838153 -0.07539587 1.000000000
## Lag5 -0.03051910 -0.008183096 -0.07249948 0.06065717 -0.075675027
## Volume 0.84194162 -0.064951313 -0.08551314 -0.06928771 -0.061074617
## Today -0.03245989 -0.075031842 0.05916672 -0.07124364 -0.007825873
## Lag5 Volume Today
## Year -0.030519101 0.84194162 -0.032459894
## Lag1 -0.008183096 -0.06495131 -0.075031842
## Lag2 -0.072499482 -0.08551314 0.059166717
## Lag3 0.060657175 -0.06928771 -0.071243639
## Lag4 -0.075675027 -0.06107462 -0.007825873
## Lag5 1.000000000 -0.05851741 0.011012698
## Volume -0.058517414 1.00000000 -0.033077783
## Today 0.011012698 -0.03307778 1.000000000
# Volume grows over time (rows are in chronological order)
plot(Weekly$Volume,xlab = "time", ylab = "Volume",col = "cyan4")
# 13(b): logistic regression of Direction on the five lags plus Volume
glm.fit <- glm(Direction ~ Lag1+Lag2+Lag3+Lag4+Lag5+Volume, data=Weekly, family= binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
## Number of Fisher Scoring iterations: 4
# Only Lag2 is statistically significant (p = 0.0296).
coef(glm.fit)
## (Intercept) Lag1 Lag2 Lag3 Lag4 Lag5
## 0.26686414 -0.04126894 0.05844168 -0.01606114 -0.02779021 -0.01447206
## Volume
## -0.02274153
# full coefficient table with standard errors, z values and p-values
summary(glm.fit)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686414 0.08592961 3.1056134 0.001898848
## Lag1 -0.04126894 0.02641026 -1.5626099 0.118144368
## Lag2 0.05844168 0.02686499 2.1753839 0.029601361
## Lag3 -0.01606114 0.02666299 -0.6023760 0.546923890
## Lag4 -0.02779021 0.02646332 -1.0501409 0.293653342
## Lag5 -0.01447206 0.02638478 -0.5485006 0.583348244
## Volume -0.02274153 0.03689812 -0.6163330 0.537674762
dim(Weekly) #1089 rows, 9 columns
## [1] 1089 9
# fitted probabilities P(Direction = "Up") on the training data
glm.probs <- predict(glm.fit,type = "response")
glm.probs[1:10] #first 10 probs
## 1 2 3 4 5 6 7 8
## 0.6086249 0.6010314 0.5875699 0.4816416 0.6169013 0.5684190 0.5786097 0.5151972
## 9 10
## 0.5715200 0.5554287
# dummy coding: 1 means "Up", so glm.probs estimates P(Up)
contrasts(Weekly$Direction) #1 means up
## Up
## Down 0
## Up 1
# prediction: classify as "Up" whenever the fitted probability exceeds 0.5
glm.pred <- rep("Down", nrow(Weekly))
glm.pred[glm.probs > 0.5] <- "Up"
#confusion matrix
table(glm.pred, Weekly$Direction)
##
## glm.pred Down Up
## Down 54 48
## Up 430 557
# False classification rate: the off-diagonal counts of the table above are
# 48 + 430 = 478, so the error rate is 478/1089 ~= 0.439.
# BUG FIX: the original hard-coded (141+457)/1089 = 0.5491276, which does not
# match this confusion matrix; compute the rate from the predictions instead.
totalFalseRate <- mean(glm.pred != Weekly$Direction)
totalFalseRate
## [1] 0.4389348   (previously mis-reported as 0.5491276)
# 13(d): train on 1990-2008, hold out 2009-2010 as the test set
train <- subset(Weekly,Year<=2008)
test <- subset(Weekly,Year>2008)
dim(train) #985 rows, 9 columns
## [1] 985 9
dim(test)#104 rows, 9 columns
## [1] 104 9
# Lag2 is the only predictor
glm.fitTrain <- glm(Direction ~ Lag2, data=train, family= binomial)
summary(glm.fitTrain)
##
## Call:
## glm(formula = Direction ~ Lag2, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.536 -1.264 1.021 1.091 1.368
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20326 0.06428 3.162 0.00157 **
## Lag2 0.05810 0.02870 2.024 0.04298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.5 on 983 degrees of freedom
## AIC: 1354.5
##
## Number of Fisher Scoring iterations: 4
# fitted P(Up) on the held-out 2009-2010 weeks
glm.probsTest <- predict(glm.fitTrain, test, type = "response")
glm.probsTest[1:10] #first 10 probs
## 986 987 988 989 990 991 992 993
## 0.5261291 0.6447364 0.4862159 0.4852001 0.5197667 0.5401255 0.6233482 0.4809930
## 994 995
## 0.4512204 0.4848808
# prediction: 0.5 threshold; 104 = nrow(test)
glm.predTest <- rep("Down",104)
glm.predTest[glm.probsTest>0.5]="Up"
#confusion matrix
table(glm.predTest, test$Direction)
##
## glm.predTest Down Up
## Down 9 5
## Up 34 56
# Correct classification rate on the test set: (9 + 56)/104
mean(glm.predTest == test$Direction)
## [1] 0.625
# 13(e): same train/test split, now with LDA
train <- subset(Weekly,Year<=2008)
test <- subset(Weekly,Year>2008)
dim(train) #985 rows, 9 columns
## [1] 985 9
dim(test)#104 rows, 9 columns
## [1] 104 9
library(MASS)
##
## 载入程辑包:'MASS'
## The following object is masked _by_ '.GlobalEnv':
##
## Boston
## The following object is masked from 'package:ISLR2':
##
## Boston
# Lag2 is the only predictor; linear discriminant analysis
lda.fitTrain <- lda(Direction ~ Lag2, data = train)
lda.fitTrain
## Call:
## lda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.4414162
#plot(lda.fitTrain, col="skyblue")
#prediction on the test set
lda.pred <- predict(lda.fitTrain,test)
#confusion matrix — identical to the logistic-regression one
table(lda.pred$class, test$Direction)
##
## Down Up
## Down 9 5
## Up 34 56
# Correct classification rate (same as logistic regression: 0.625)
mean(lda.pred$class == test$Direction)
## [1] 0.625
# 13(f): same train/test split, now with QDA
train <- subset(Weekly,Year<=2008)
test <- subset(Weekly,Year>2008)
dim(train) #985 rows, 9 columns
## [1] 985 9
dim(test)#104 rows, 9 columns
## [1] 104 9
# Lag2 is the only predictor; quadratic discriminant analysis
qda.fitTrain <- qda(Direction ~ Lag2, data = train)
qda.fitTrain
## Call:
## qda(Direction ~ Lag2, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2
## Down -0.03568254
## Up 0.26036581
#plot(lda.fitTrain, col="skyblue")
#prediction
qda.class <- predict(qda.fitTrain, test)$class
#confusion matrix
table(qda.class, test$Direction)
##
## qda.class Down Up
## Down 0 0
## Up 43 61
#True classification rate
mean(qda.class == test$Direction)
## [1] 0.5865385
library(class)
#train is now a logical index over Weekly (earlier chunks used a data frame)
train <- (Weekly$Year<=2008)
#knn() requires matrix inputs even with a single predictor
train.X <- as.matrix(Weekly$Lag2[train])
test.X <- as.matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
#seed fixes how knn() breaks distance ties
set.seed(2)
knn.pred <- knn(train.X,test.X,train.Direction,k=1)
#NOTE(review): test$Direction relies on the data-frame `test` built in an
#earlier chunk (Year > 2008) still being in scope -- confirm
table(knn.pred,test$Direction)
##
## knn.pred Down Up
## Down 21 30
## Up 22 31
#overall test-set accuracy: k=1 is no better than a coin flip here
mean(knn.pred == test$Direction)
## [1] 0.5
library(klaR)
#rebuild the 1990-2008 / 2009-2010 split as data frames
train <- subset(Weekly,Year<=2008)
test <- subset(Weekly,Year>2008)
dim(train) #985 obs x 9 vars
## [1] 985 9
dim(test)#104 obs x 9 vars
## [1] 104 9
#fit naive Bayes with Lag2 as the only predictor
bayes.fit <- NaiveBayes(Direction~Lag2,train)
#predict on the held-out period
bayes.pred <- predict(bayes.fit,test)
#confusion matrix: like QDA, every test week is predicted "Up"
table(bayes.pred$class,test$Direction)
##
## Down Up
## Down 0 0
## Up 43 61
#overall test-set accuracy (matches QDA)
mean(bayes.pred$class==test$Direction)
## [1] 0.5865385
#Logistic regression and LDA with additional lag terms
######### Logistic regression #########
#(test accuracy by predictor set: Lag2+Volume 0.54, Lag1+Lag2+Volume 0.53,
# Lag1+Lag2 0.58, Lag2+Lag3 0.625)
train <- subset(Weekly,Year<=2008)
test <- subset(Weekly,Year>2008)
#Lag2+Lag3 are the predictors (the old comment wrongly said Lag2+Volume)
glm.fitTrain <- glm(Direction ~ Lag2+Lag3, data=train, family= binomial)
summary(glm.fitTrain)
##
## Call:
## glm(formula = Direction ~ Lag2 + Lag3, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.569 -1.260 1.019 1.093 1.356
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.20501 0.06442 3.182 0.00146 **
## Lag2 0.05707 0.02878 1.983 0.04740 *
## Lag3 -0.01216 0.02862 -0.425 0.67088
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1354.7 on 984 degrees of freedom
## Residual deviance: 1350.4 on 982 degrees of freedom
## AIC: 1356.4
##
## Number of Fisher Scoring iterations: 4
#compute predicted probabilities of "Up" on the held-out period
glm.probsTest <- predict(glm.fitTrain, test, type = "response")
glm.probsTest[1:10] #first 10 probabilities
## 986 987 988 989 990 991 992 993
## 0.5241920 0.6482725 0.4672852 0.5003146 0.5344438 0.5471932 0.6245983 0.4669750
## 994 995
## 0.4679244 0.5073575
#classify with a 0.5 cutoff; size the vector from the predictions
#instead of hard-coding the 104 test observations
glm.predTest <- rep("Down", length(glm.probsTest))
glm.predTest[glm.probsTest > 0.5] <- "Up"
#confusion matrix (rows = predicted, columns = actual)
table(glm.predTest, test$Direction)
##
## glm.predTest Down Up
## Down 8 4
## Up 35 57
#overall test-set accuracy
mean(glm.predTest == test$Direction)
## [1] 0.625
######### LDA #########
library(MASS)
#Lag2+Lag3 as predictors: test accuracy 0.625
lda.fitTrain <- lda(Direction ~ Lag2+Lag3, data = train)
lda.fitTrain
## Call:
## lda(Direction ~ Lag2 + Lag3, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 Lag3
## Down -0.03568254 0.17080045
## Up 0.26036581 0.08404044
##
## Coefficients of linear discriminants:
## LD1
## Lag2 0.42459797
## Lag3 -0.08880475
#plot(lda.fitTrain, col="skyblue")
#predict class labels for the held-out period
lda.pred <- predict(lda.fitTrain,test)
#confusion matrix (rows = predicted, columns = actual)
table(lda.pred$class, test$Direction)
##
## Down Up
## Down 8 4
## Up 35 57
#overall test-set accuracy (matches logistic regression on these predictors)
mean(lda.pred$class == test$Direction)
## [1] 0.625
######### QDA #########
#Lag2+Lag3 as predictors: test accuracy about 0.606
qda.fitTrain <- qda(Direction ~ Lag2+Lag3, data = train)
qda.fitTrain
## Call:
## qda(Direction ~ Lag2 + Lag3, data = train)
##
## Prior probabilities of groups:
## Down Up
## 0.4477157 0.5522843
##
## Group means:
## Lag2 Lag3
## Down -0.03568254 0.17080045
## Up 0.26036581 0.08404044
#plot(qda.fitTrain, col="skyblue")  # (stale comment said lda.fitTrain)
#predict class labels for the held-out period
qda.class <- predict(qda.fitTrain, test)$class
#confusion matrix (rows = predicted, columns = actual)
table(qda.class, test$Direction)
##
## qda.class Down Up
## Down 4 2
## Up 39 59
#overall test-set accuracy
mean(qda.class == test$Direction)
## [1] 0.6057692
######### KNN #########
library(class)
#logical train index over Weekly; knn() wants matrix predictors
train <- (Weekly$Year<=2008)
train.X <- as.matrix(Weekly$Lag2[train])
test.X <- as.matrix(Weekly$Lag2[!train])
train.Direction <- Weekly$Direction[train]
set.seed(2)
#large k smooths the decision rule; k=180 reaches 0.625 test accuracy
knn.pred <- knn(train.X,test.X,train.Direction,k=180) #k=180:0.625
#NOTE(review): test$Direction uses the data-frame `test` from an earlier chunk
table(knn.pred,test$Direction)
##
## knn.pred Down Up
## Down 9 5
## Up 34 56
#overall test-set accuracy
mean(knn.pred == test$Direction)
## [1] 0.625
# Binary response: 1 when a car's mpg exceeds the median mpg, else 0
# (numeric 0/1 vector, as in the original rep-then-assign construction).
mpg01 <- as.numeric(cleanedAuto$mpg > median(cleanedAuto$mpg))
Auto <- data.frame(cleanedAuto, mpg01)
Auto$horsepower <- as.numeric(Auto$horsepower)
# Scatterplot matrix; use a copy so that converting name to a factor
# does not touch Auto itself.
temp <- Auto
temp$name <- as.factor(Auto$name)
pairs(temp, col = "lemonchiffon2")
names(Auto)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name" "mpg01"
# Train/test split: even model years train, odd model years test.
train <- (Auto$year %% 2 == 0)
train_x <- Auto[train, ]
test_x <- Auto[!train, ]
library(MASS)
#LDA for mpg01 on the variables most associated with it
lda.train <- lda(mpg01 ~ displacement+horsepower+weight+acceleration+year, data = train_x)
lda.train
## Call:
## lda(mpg01 ~ displacement + horsepower + weight + acceleration +
## year, data = train_x)
##
## Prior probabilities of groups:
## 0 1
## 0.4571429 0.5428571
##
## Group means:
## displacement horsepower weight acceleration year
## 0 271.7396 133.14583 3604.823 14.47500 74.10417
## 1 111.6623 77.92105 2314.763 16.62895 77.78947
##
## Coefficients of linear discriminants:
## LD1
## displacement -0.008311844
## horsepower 0.014846052
## weight -0.001583211
## acceleration 0.002468036
## year 0.105990658
#plot(lda.train, col="skyblue")  # (stale comment said lda.fitTrain)
#predict class labels on the odd-year test set
lda.pred <- predict(lda.train,test_x)
lda.pred$class
## [1] 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0
## Levels: 0 1
#confusion matrix (rows = predicted, columns = actual)
table(lda.pred$class, test_x$mpg01)
##
## 0 1
## 0 84 6
## 1 16 76
#Test error = 0.1208791 (the old comment's 0.1195652 did not match the output)
mean(lda.pred$class != test_x$mpg01)
## [1] 0.1208791
#QDA on the same predictors
qda.train <- qda(mpg01 ~ displacement+horsepower+weight+acceleration+year, data = train_x)
#predict class labels on the odd-year test set
qda.class <- predict(qda.train, test_x)$class
#confusion matrix (rows = predicted, columns = actual)
table(qda.class, test_x$mpg01)
##
## qda.class 0 1
## 0 89 8
## 1 11 74
#Test error = 0.1043956 (the old comment's 0.1032609 did not match the output)
mean(qda.class != test_x$mpg01)
## [1] 0.1043956
# Logistic regression on the same predictors.
# FIX: the original call omitted family = binomial, so glm() silently fit a
# Gaussian (linear) model; the recorded output below came from that fit and
# may differ slightly after the fix.
glm.train <- glm(mpg01 ~ displacement+horsepower+weight+acceleration+year,
                 data = train_x, family = binomial)
#predicted probabilities on the odd-year test set
glm.prob <- predict(glm.train,test_x, type = "response")
#classify with a 0.5 cutoff
glm.pred2 <- rep(0, nrow(test_x))
glm.pred2[glm.prob > 0.5] <- 1
#confusion matrix (rows = predicted, columns = actual)
table(glm.pred2,test_x$mpg01)
##
## glm.pred2 0 1
## 0 84 6
## 1 16 76
#test error (recorded value from the original Gaussian fit)
mean(glm.pred2 != test_x$mpg01)
## [1] 0.1208791
library(e1071)
# Naive Bayes.
# FIX: the original passed the entire train_x frame as predictors, which
# includes the response mpg01 itself and mpg (from which mpg01 is derived) --
# target leakage that explains the implausibly low recorded error of 0.033.
# Also pass the class labels as a factor, the type naiveBayes expects.
nb.x <- train_x[, !(names(train_x) %in% c("mpg", "mpg01"))]
m <- naiveBayes(nb.x, as.factor(train_x$mpg01), laplace = 0)
p <- predict(m, test_x, type="class")
#confusion matrix (rows = predicted, columns = actual)
table(p,test_x$mpg01)
## (recorded output below came from the leaky fit)
##
## p 0 1
## 0 98 4
## 1 2 78
#Test error
mean(p != test_x$mpg01)
## [1] 0.03296703
library(class)
#NOTE(review): only `year` is used as the KNN predictor here -- presumably a
#deliberate (if weak) choice; confirm against the exercise intent
train_knn <- as.matrix(Auto$year[train])
test_knn <- as.matrix(Auto$year[!train])
train_class <- Auto$mpg01[train]
test_class <- Auto$mpg01[!train]
#seed fixes how knn() breaks distance ties
set.seed(5)
pred_knn <- knn(train_knn,test_knn,train_class,k=3)
table(pred_knn,test_class)
## test_class
## pred_knn 0 1
## 0 84 41
## 1 16 41
#Test error = 0.3131868
mean(pred_knn != test_class)
## [1] 0.3131868
# First version: return the cube of x.
Power <- function(x) {
  cube <- x * x * x
  return(cube)
}
# Second version (overrides the first): print the cube instead.
# print() still returns the value invisibly, so Power(2) yields 8.
Power <- function(x) {
  cube <- x * x * x
  print(cube)
}
Power(2)
## [1] 8
# A return()-based variant, kept for reference:
#Power2 <- function(x, n) {
#  acc <- 1
#  while (n > 0) {
#    acc <- acc * x
#    n <- n - 1
#  }
#  return(acc)
#}
# Raise x to the power n by repeated multiplication, printing the result.
# Decrements n by 1 per step, exactly as the while-loop formulation did.
Power2 <- function(x, n) {
  acc <- 1
  repeat {
    if (n <= 0) break
    acc <- acc * x
    n <- n - 1
  }
  print(acc)
}
Power2(3,8)
## [1] 6561
Power2(10,3)
## [1] 1000
Power2(8,17)
## [1] 2.2518e+15
Power2(131,3)
## [1] 2248091
# Like Power2 but returns the result instead of printing it, so it can be
# reused (e.g. by the plotting code below). Works elementwise on a vector x.
Power3 <- function(x, n) {
  acc <- 1
  repeat {
    if (n <= 0) break
    acc <- acc * x
    n <- n - 1
  }
  return(acc)
}
#side-by-side: x vs x^2, and the same curve on log-log axes (a line of slope 2)
par(mfrow = c(1,2))
plot(seq(1,10,0.1),Power3(seq(1,10,0.1),2), xlab = "x", ylab = "x^2",col = "cyan3" )
plot(log(seq(1,10,0.1)),log(Power3(seq(1,10,0.1),2)), xlab = "logx", ylab = "logy" ,col="orange")
# Plot x against x^n for a vector of x values.
# FIX: the original rebuilt the x-axis as seq(range(x)[1], range(x)[2]),
# which silently forces a step of 1 and discards non-integer or unevenly
# spaced inputs; plot the supplied values directly instead.
# (Identical output for the recorded call PlotPower(1:10, 3).)
PlotPower <- function(x, n) {
  plot(x, Power3(x, n), xlab = "x", ylab = "y", col = "cyan4")
}
PlotPower(1:10,3)
#head(Boston)
Boston <- read.csv("/Users/ranjiang/Desktop/dis/dataset/Boston.csv")
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-3
# Binary response: 1 if crim exceeds its median, else 0.
crime01 <- rep(0,length(Boston$crim))
crime01[Boston$crim>median(Boston$crim)] <- 1
boston <- data.frame(Boston,crime01)
#boston$crime01 <- as.factor(boston$crime01)
# FIX: "1:nrow(boston)/2" parses as (1:nrow(boston))/2 -- fractional row
# indices that truncate and duplicate rows -- and "nrow(boston)/2+1:nrow(boston)"
# as 253 + (1:nrow(boston)), which runs past the last row and filled test_x
# with NA rows. Parenthesize the split point.
half <- nrow(boston) %/% 2
train_x <- boston[1:half, ]
test_x <- boston[(half + 1):nrow(boston), ]
test_class <- crime01[(half + 1):nrow(boston)]
######### logistic regression #########
#zn excluded from the predictor set
glm.fit <- glm(crime01 ~ indus+nox+rm+age+dis+rad+tax,data = train_x,family = "binomial")
#predicted probabilities on the held-out half
glm.prob <- predict(glm.fit,test_x, type = "response")
#classify with a 0.5 cutoff
glm.pred <- rep(0, nrow(test_x))
glm.pred[glm.prob > 0.5] <- 1
#confusion matrix (rows = predicted, columns = actual)
table(glm.pred,test_class)
## test_class
## glm.pred 0 1
## 0 75 8
## 1 15 155
# Test error computed from the predictions.
# FIX: the original hand-typed (75+1)/(75+155+8+15) = 0.3003953, which matches
# neither the off-diagonal count (8+15 = 23, error ~0.0909) nor any valid rate.
mean(glm.pred != test_class)
######### LDA #########
library(MASS)
#LDA on a reduced predictor set
lda.train <- lda(crime01 ~ indus+nox+dis+rad+tax, data = train_x)
#predict class labels on the held-out half
lda.pred <- predict(lda.train,test_x)
#confusion matrix (rows = predicted, columns = actual)
table(lda.pred$class, test_class)
## test_class
## 0 1
## 0 80 18
## 1 10 145
# Test error computed from the predictions rather than hand-typed counts
# (the original 28/(28+80+145) = 28/253 happens to give the same value).
mean(lda.pred$class != test_class)
## [1] 0.1106719
######### Naive Bayes #########
library(e1071)
# FIX: the original passed all of train_x as predictors, which includes the
# response crime01 itself and crim (from which crime01 is derived) -- target
# leakage. Drop both, and pass the class labels as a factor.
nb.x <- train_x[, !(names(train_x) %in% c("crim", "crime01"))]
m <- naiveBayes(nb.x, as.factor(train_x$crime01), laplace = 0)
p <- predict(m, test_x, type="class")
#confusion matrix (rows = predicted, columns = actual)
table(p,test_class)
## (recorded output below came from the leaky fit)
## test_class
## p 0 1
## 0 88 65
## 1 2 98
# Test error computed from the predictions (original hand-typed 67/253).
mean(p != test_class)